knitr::opts_chunk$set(message = FALSE)

library(tidyverse)
library(lubridate)
library(plotly)
library(patchwork)

Data input and cleaning

bakery_df =
  read_csv("./Data/Bakery_sales.csv") %>% 
  janitor::clean_names() %>% 
  mutate(
    unit_price = str_replace(unit_price, "€", ""),
    unit_price = str_replace(unit_price, ",", "."),
    unit_price = as.numeric(unit_price),
    product_name = article) %>% 
  filter(product_name != ".") %>% 
  select(-article)

bakery_df
## # A tibble: 234,000 × 7
##       x1 date       time   ticket_number quantity unit_price product_name       
##    <dbl> <date>     <time>         <dbl>    <dbl>      <dbl> <chr>              
##  1     0 2021-01-02 08:38         150040        1       0.9  BAGUETTE           
##  2     1 2021-01-02 08:38         150040        3       1.2  PAIN AU CHOCOLAT   
##  3     4 2021-01-02 09:14         150041        2       1.2  PAIN AU CHOCOLAT   
##  4     5 2021-01-02 09:14         150041        1       1.15 PAIN               
##  5     8 2021-01-02 09:25         150042        5       1.2  TRADITIONAL BAGUET…
##  6    11 2021-01-02 09:25         150043        2       0.9  BAGUETTE           
##  7    12 2021-01-02 09:25         150043        3       1.1  CROISSANT          
##  8    15 2021-01-02 09:27         150044        1       1.05 BANETTE            
##  9    18 2021-01-02 09:32         150045        3       1.2  TRADITIONAL BAGUET…
## 10    19 2021-01-02 09:32         150045        6       1.1  CROISSANT          
## # … with 233,990 more rows

A pie chart showing the percentage of each product’s sale count (top 10)

plot_pie =
  bakery_df %>% 
  group_by(product_name) %>% 
  summarize(n_obs = n()) %>% 
  arrange(desc(n_obs)) %>% 
  head(10) %>% 
  plot_ly(labels = ~product_name, values = ~n_obs, type = 'pie', colors = "viridis") %>% 
  layout(title = 'Top 10 favoriate bakeries',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

plot_pie

A pie chart showing the percentage of each product’s sale count (top 10) in 2021/2022

plot_pie =
  bakery_df %>% 
  separate(date, into = c("year", "month", "day"), sep = "-") %>% 
  mutate(year = as.numeric(year),
         month = as.integer(month),
         day = as.integer(day),) 
  
plot_pie_2021 = 
  plot_pie %>% 
  filter(year == 2021) %>%
  group_by(product_name) %>% 
  summarize(n_obs = n()) %>% 
  arrange(desc(n_obs)) %>% 
  head(10) %>% 
  plot_ly(labels = ~product_name, values = ~n_obs, type = 'pie', colors = "viridis") %>% 
  layout(title = 'Top 10 favoriate bakeries in 2021',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

plot_pie_2022 = 
  plot_pie %>% 
  filter(year == 2022) %>%
  group_by(product_name) %>% 
  summarize(n_obs = n()) %>% 
  arrange(desc(n_obs)) %>% 
  head(10) %>% 
  plot_ly(labels = ~product_name, values = ~n_obs, type = 'pie', colors = "viridis") %>% 
  layout(title = 'Top 10 favoriate bakeries in 2022',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

plot_pie_2021
plot_pie_2022

top 10 favorite bread monthly sale counts in 2021

top_line =
  bakery_df %>% 
  separate(date, into = c("year", "month", "day"), sep = "-") %>% 
  mutate(year = as.numeric(year),
         month = as.integer(month),
         day = as.integer(day),) %>%
  filter(year == 2021) %>% 
  select (year, month, product_name, quantity) %>% 
  group_by (year, month, product_name) %>% 
  mutate (total_counts = sum (quantity)) %>% 
  distinct(year, month, product_name,total_counts) 

top_line1_2021 =
  top_line %>% 
  filter(product_name %in% c("TRADITIONAL BAGUETTE")) %>% 
  ggplot(aes(x =month, y = total_counts, color = product_name)) + 
    geom_line() +
    theme(legend.position = "bottom") + 
    labs(x = "month",
         y = "counts of bread",
         title = "monthly sale counts of the most popular bread in 2021") +
    scale_x_continuous(
    breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), 
    labels = c("Jan","Feb","Mar","Apr","May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) 

top_line2_2021 =
  top_line %>% 
    filter(product_name %in% c("COUPE", "BAGUETTE","BANETTE", "CROISSANT", "PAIN AU CHOCOLAT", "CEREAL BAGUETTE", "SPECIAL BREAD", "CAMPAGNE", "BOULE 400G")) %>% 
  ggplot(aes(x =month, y = total_counts, color = product_name)) + 
    geom_line() +
    theme(legend.position = "bottom") + 
    labs(x = "month",
         y = "counts of bread",
         title = "top 2-10 popular bread monthly sale counts in 2021") +
    scale_x_continuous(
    breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), 
    labels = c("Jan","Feb","Mar","Apr","May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) 
 
top_line1_2021+top_line2_2021

top 10 favorite bread monthly sale counts in 2022

top_line =
  bakery_df %>% 
  separate(date, into = c("year", "month", "day"), sep = "-") %>% 
  mutate(year = as.numeric(year),
         month = as.integer(month),
         day = as.integer(day),) %>%
  filter(year == 2022) %>% 
  select (year, month, product_name, quantity) %>% 
  group_by (year, month, product_name) %>% 
  mutate (total_counts = sum (quantity)) %>% 
  distinct(year, month, product_name,total_counts) 

top_line1_2022 =
  top_line %>% 
  filter(product_name %in% c("TRADITIONAL BAGUETTE")) %>% 
  ggplot(aes(x =month, y = total_counts, color = product_name)) + 
    geom_line() +
    theme(legend.position = "bottom") + 
    labs(x = "month",
         y = "counts of bread",
         title = "monthly sale counts of the most popular bread in 2022") +
    scale_x_continuous(
    breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), 
    labels = c("Jan","Feb","Mar","Apr","May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) 

top_line2_2022 =
  top_line %>% 
    filter(product_name %in% c("COUPE", "BAGUETTE","BANETTE", "CROISSANT", "PAIN AU CHOCOLAT", "SPECIAL BREAD", "FORMULE SANDWICH","CEREAL BAGUETTE", "BOULE 400G")) %>% 
  ggplot(aes(x =month, y = total_counts, color = product_name)) + 
    geom_line() +
    theme(legend.position = "bottom") + 
    labs(x = "month",
         y = "counts of bread",
         title = "top 2-10 popular bread monthly sale counts in 2022") +
    scale_x_continuous(
    breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), 
    labels = c("Jan","Feb","Mar","Apr","May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) 
 
top_line1_2022+top_line2_2022

A line plot of trends showing total sale revenue by month (trend)

plot_line =
  bakery_df %>% 
  separate(date, into = c("year", "month", "day"), sep = "-") %>% 
  mutate(year = as.numeric(year),
         month = as.integer(month),
         day = as.integer(day),) %>% 
  group_by(year, month) %>% 
  mutate(rev = quantity * unit_price) %>% 
  summarize(month_rev = sum(rev)) %>% 
  
  plot_ly(x = ~month, y = ~month_rev, mode = 'lines+markers', alpha = 0.5)

plot_line 

ranking of mean prices of different products

test =
  bakery_df %>% 
  group_by(product_name) %>% 
  summarize(mean_price = mean(unit_price)) %>% 
  arrange(desc(mean_price)) %>% 
  mutate(product_name = fct_reorder(product_name, mean_price)) %>% 
  ggplot(aes(x =product_name, y = mean_price, color = product_name)) + 
    geom_point() +
    theme(axis.text.x = element_text(angle=90,hjust = 1)) + 
    labs(x = "name of products",
         y = "mean prices",
         title = "ranking of mean prices of different products") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "none")
  
test 

ranking of total revenue of top 10 products in 2021/2022

total_rev_bar=
  bakery_df %>% 
  group_by(product_name) %>% 
  mutate(rev = quantity * unit_price) %>% 
  summarize(prod_rev = sum(rev)) %>% 
  arrange(desc(prod_rev)) 
  
total_rev_bar_2021=
  total_rev_bar %>% 
  filter(product_name %in% c("TRADITIONAL BAGUETTE", "COUPE", "BAGUETTE","BANETTE", "CROISSANT", "PAIN AU CHOCOLAT", "CEREAL BAGUETTE", "SPECIAL BREAD", "CAMPAGNE", "BOULE 400G")) %>% 
  mutate(product_name = fct_reorder(product_name, prod_rev)) %>% 
  ggplot(aes(x =product_name, y= prod_rev, fill=product_name)) + 
    geom_bar(stat="identity", width=0.5) +
    labs(x = "name of products",
         y = "total revenue",
         title = "ranking in 2021") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
  
total_rev_bar_2022=
  total_rev_bar %>% 
  filter(product_name %in% c("TRADITIONAL BAGUETTE", "COUPE", "BAGUETTE","BANETTE", "CROISSANT", "PAIN AU CHOCOLAT", "SPECIAL BREAD", "FORMULE SANDWICH","CEREAL BAGUETTE", "BOULE 400G")) %>% 
  mutate(product_name = fct_reorder(product_name, prod_rev)) %>% 
  ggplot(aes(x =product_name, y= prod_rev, fill=product_name)) + 
    geom_bar(stat="identity", width=0.5) +
    labs(x = "name of products",
         y = "total revenue",
         title = "ranking in 2022") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
 
total_rev_bar_2021+total_rev_bar_2022

distribution of monthly revenue in 2021/2022

dis_rev_bar=
  bakery_df %>% 
  separate(date, into = c("year", "month", "day"), sep = "-") %>% 
  mutate(year = as.numeric(year),
         month = as.integer(month),
         day = as.integer(day),) 

dis_rev_bar_2021=
  dis_rev_bar %>%
    filter(year == 2021,
         product_name %in% c("TRADITIONAL BAGUETTE", "COUPE", "BAGUETTE","BANETTE", "CROISSANT", "PAIN AU CHOCOLAT", "CEREAL BAGUETTE", "SPECIAL BREAD", "CAMPAGNE", "BOULE 400G")) %>% 
  group_by(year, month, product_name) %>% 
  mutate(rev = quantity * unit_price) %>% 
  summarize(prod_dis_rev = sum(rev)) %>% 
  arrange(desc(prod_dis_rev)) %>% 
   mutate(product_name = fct_reorder(product_name, prod_dis_rev)) %>% 
    ggplot(aes(x =month, y= prod_dis_rev, fill=product_name)) + 
    geom_bar(stat="identity", position="stack") +
    labs(x = "month",
         y = "total revenue",
         title = "distribution of monthly revenue in 2021") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom") +
  scale_x_continuous(
    breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), 
    labels = c("Jan","Feb","Mar","Apr","May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) 

dis_rev_bar_2022=
  dis_rev_bar %>%
    filter(year == 2022,
         product_name %in% c("TRADITIONAL BAGUETTE", "COUPE", "BAGUETTE","BANETTE", "CROISSANT", "PAIN AU CHOCOLAT", "SPECIAL BREAD", "FORMULE SANDWICH","CEREAL BAGUETTE", "BOULE 400G")) %>% 
  group_by(year, month, product_name) %>% 
  mutate(rev = quantity * unit_price) %>% 
  summarize(prod_dis_rev = sum(rev)) %>% 
  arrange(desc(prod_dis_rev)) %>% 
   mutate(product_name = fct_reorder(product_name, prod_dis_rev)) %>% 
    ggplot(aes(x =month, y= prod_dis_rev, fill=product_name)) + 
    geom_bar(stat="identity", position="stack") +
    labs(x = "month",
         y = "total revenue",
         title = "distribution of monthly revenue in 2022") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom") +
  scale_x_continuous(
    breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), 
    labels = c("Jan","Feb","Mar","Apr","May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) 

dis_rev_bar_2021+dis_rev_bar_2022